In [102]:
import numpy as np
import pandas as pd
df=pd.read_csv(r'/Users/binodrai/Desktop/1718991964134-9073-customer_churn_data.csv')
In [103]:
df.head(5)
Out[103]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CUST0000 | Male | 0 | No | Yes | 23 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Bank transfer | 49.85 | 1146.55 | No |
| 1 | CUST0001 | Female | 0 | Yes | No | 43 | No | No phone service | DSL | Yes | ... | Yes | No | Yes | No | Month-to-month | No | Mailed check | 100.70 | 4330.10 | Yes |
| 2 | CUST0002 | Male | 1 | No | No | 51 | Yes | No | DSL | No | ... | Yes | Yes | No | No | One year | No | Electronic check | 97.33 | 4963.83 | Yes |
| 3 | CUST0003 | Male | 1 | No | No | 72 | Yes | Yes | DSL | Yes | ... | Yes | No | No | No | Month-to-month | No | Credit card | 101.38 | 7299.36 | No |
| 4 | CUST0004 | Male | 1 | No | No | 25 | Yes | Yes | DSL | No | ... | No | Yes | No | Yes | Month-to-month | No | Electronic check | 52.22 | 1305.50 | Yes |
5 rows × 21 columns
In [104]:
df.tail()
Out[104]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5875 | CUST5875 | Male | 0 | Yes | Yes | 71 | Yes | No | DSL | Yes | ... | No | Yes | No | No | Month-to-month | Yes | Mailed check | 74.21 | 5268.91 | No |
| 5876 | CUST5876 | Male | 0 | No | No | 22 | Yes | Yes | DSL | No | ... | Yes | Yes | No | Yes | One year | No | Mailed check | 65.43 | 1439.46 | Yes |
| 5877 | CUST5877 | Female | 0 | No | No | 68 | No | No phone service | Fiber optic | Yes | ... | Yes | Yes | Yes | No | One year | Yes | Bank transfer | 59.78 | 4065.04 | No |
| 5878 | CUST5878 | Female | 0 | Yes | Yes | 14 | No | No phone service | Fiber optic | Yes | ... | Yes | Yes | Yes | Yes | Month-to-month | Yes | Mailed check | 91.88 | 1286.32 | No |
| 5879 | CUST5879 | Female | 1 | Yes | No | 23 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Mailed check | 25.45 | 585.35 | Yes |
5 rows × 21 columns
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5880 entries, 0 to 5879 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 5880 non-null object 1 gender 5880 non-null object 2 SeniorCitizen 5880 non-null int64 3 Partner 5880 non-null object 4 Dependents 5880 non-null object 5 tenure 5880 non-null int64 6 PhoneService 5880 non-null object 7 MultipleLines 5880 non-null object 8 InternetService 5880 non-null object 9 OnlineSecurity 5880 non-null object 10 OnlineBackup 5880 non-null object 11 DeviceProtection 5880 non-null object 12 TechSupport 5880 non-null object 13 StreamingTV 5880 non-null object 14 StreamingMovies 5880 non-null object 15 Contract 5880 non-null object 16 PaperlessBilling 5880 non-null object 17 PaymentMethod 5880 non-null object 18 MonthlyCharges 5880 non-null float64 19 TotalCharges 5880 non-null float64 20 Churn 5880 non-null object dtypes: float64(2), int64(2), object(17) memory usage: 964.8+ KB
In [8]:
df.describe()
Out[8]:
| SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|
| count | 5880.000000 | 5880.000000 | 5880.000000 | 5880.000000 |
| mean | 0.500680 | 36.549150 | 70.157779 | 2566.813165 |
| std | 0.500042 | 20.909674 | 28.804615 | 1910.017743 |
| min | 0.000000 | 1.000000 | 20.000000 | 20.030000 |
| 25% | 0.000000 | 18.000000 | 45.717500 | 1020.217500 |
| 50% | 1.000000 | 37.000000 | 70.155000 | 2136.445000 |
| 75% | 1.000000 | 55.000000 | 95.457500 | 3767.665000 |
| max | 1.000000 | 72.000000 | 119.990000 | 8589.600000 |
In [9]:
df.shape
Out[9]:
(5880, 21)
In [10]:
df.dtypes
Out[10]:
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges float64 Churn object dtype: object
In [11]:
df.nunique()
Out[11]:
customerID 5880 gender 2 SeniorCitizen 2 Partner 2 Dependents 2 tenure 72 PhoneService 2 MultipleLines 3 InternetService 3 OnlineSecurity 3 OnlineBackup 3 DeviceProtection 3 TechSupport 3 StreamingTV 3 StreamingMovies 3 Contract 3 PaperlessBilling 2 PaymentMethod 4 MonthlyCharges 4439 TotalCharges 5794 Churn 2 dtype: int64
In [23]:
df['gender'].value_counts(ascending=True)
Out[23]:
gender Female 2930 Male 2950 Name: count, dtype: int64
In [24]:
df.isnull().sum() #detect missing value in dataset
Out[24]:
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
In [25]:
df.nunique() #to know unique entries in categoraical column,understanding range of categories present
Out[25]:
customerID 5880 gender 2 SeniorCitizen 2 Partner 2 Dependents 2 tenure 72 PhoneService 2 MultipleLines 3 InternetService 3 OnlineSecurity 3 OnlineBackup 3 DeviceProtection 3 TechSupport 3 StreamingTV 3 StreamingMovies 3 Contract 3 PaperlessBilling 2 PaymentMethod 4 MonthlyCharges 4439 TotalCharges 5794 Churn 2 dtype: int64
In [27]:
df.columns
Out[27]:
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
dtype='object')
In [36]:
correlation = df['MonthlyCharges'].corr(df['TotalCharges']).round(5)#only between numerical columns
print(f"Correlation between MonthlyCharges and TotalCharges: {correlation}")
Correlation between MonthlyCharges and TotalCharges: 0.5567
In [41]:
drop_column=df.drop('PaperlessBilling',axis=1)#axis 1 represent column have to specify drop in column or row
In [42]:
df.dtypes
Out[42]:
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges float64 Churn int64 dtype: object
In [101]:
sort_Totalcharges=df.sort_values(by='TotalCharges',ascending=False)# can i show just total charges?
a=sort_Totalcharges['TotalCharges']
a.head()
Out[101]:
1765 8589.60 5082 8505.09 410 8498.16 1362 8478.00 5349 8449.20 Name: TotalCharges, dtype: float64
In [49]:
df.sample(2)
Out[49]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2190 | CUST2190 | Female | 0 | Yes | No | 9 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 38.31 | 344.79 | 1 |
| 3221 | CUST3221 | Female | 0 | Yes | Yes | 65 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Credit card | 103.71 | 6741.15 | 1 |
2 rows × 21 columns
In [52]:
grouped=df.groupby('gender').agg({'MonthlyCharges':'mean','tenure':'count'})
print(grouped)
MonthlyCharges tenure gender Female 69.888218 2930 Male 70.425512 2950
In [53]:
df.fillna(value=0, inplace=True) # Replace NaN with 0
print(df)
customerID gender SeniorCitizen Partner Dependents tenure \
0 CUST0000 Male 0 No Yes 23
1 CUST0001 Female 0 Yes No 43
2 CUST0002 Male 1 No No 51
3 CUST0003 Male 1 No No 72
4 CUST0004 Male 1 No No 25
... ... ... ... ... ... ...
5875 CUST5875 Male 0 Yes Yes 71
5876 CUST5876 Male 0 No No 22
5877 CUST5877 Female 0 No No 68
5878 CUST5878 Female 0 Yes Yes 14
5879 CUST5879 Female 1 Yes No 23
PhoneService MultipleLines InternetService OnlineSecurity ... \
0 No No phone service No No internet service ...
1 No No phone service DSL Yes ...
2 Yes No DSL No ...
3 Yes Yes DSL Yes ...
4 Yes Yes DSL No ...
... ... ... ... ... ...
5875 Yes No DSL Yes ...
5876 Yes Yes DSL No ...
5877 No No phone service Fiber optic Yes ...
5878 No No phone service Fiber optic Yes ...
5879 Yes No No No internet service ...
DeviceProtection TechSupport StreamingTV \
0 No internet service No internet service No internet service
1 Yes No Yes
2 Yes Yes No
3 Yes No No
4 No Yes No
... ... ... ...
5875 No Yes No
5876 Yes Yes No
5877 Yes Yes Yes
5878 Yes Yes Yes
5879 No internet service No internet service No internet service
StreamingMovies Contract PaperlessBilling PaymentMethod \
0 No internet service Month-to-month Yes Bank transfer
1 No Month-to-month No Mailed check
2 No One year No Electronic check
3 No Month-to-month No Credit card
4 Yes Month-to-month No Electronic check
... ... ... ... ...
5875 No Month-to-month Yes Mailed check
5876 Yes One year No Mailed check
5877 No One year Yes Bank transfer
5878 Yes Month-to-month Yes Mailed check
5879 No internet service Month-to-month No Mailed check
MonthlyCharges TotalCharges Churn
0 49.85 1146.55 0
1 100.70 4330.10 1
2 97.33 4963.83 1
3 101.38 7299.36 0
4 52.22 1305.50 1
... ... ... ...
5875 74.21 5268.91 0
5876 65.43 1439.46 1
5877 59.78 4065.04 0
5878 91.88 1286.32 0
5879 25.45 585.35 1
[5880 rows x 21 columns]
In [54]:
df.isna()
Out[54]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5875 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 5876 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 5877 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 5878 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 5879 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
5880 rows × 21 columns
In [56]:
df.fillna(value=0, inplace=True) # Replace NaN with 0
print(df) #there no nan value present here
customerID gender SeniorCitizen Partner Dependents tenure \
0 CUST0000 Male 0 No Yes 23
1 CUST0001 Female 0 Yes No 43
2 CUST0002 Male 1 No No 51
3 CUST0003 Male 1 No No 72
4 CUST0004 Male 1 No No 25
... ... ... ... ... ... ...
5875 CUST5875 Male 0 Yes Yes 71
5876 CUST5876 Male 0 No No 22
5877 CUST5877 Female 0 No No 68
5878 CUST5878 Female 0 Yes Yes 14
5879 CUST5879 Female 1 Yes No 23
PhoneService MultipleLines InternetService OnlineSecurity ... \
0 No No phone service No No internet service ...
1 No No phone service DSL Yes ...
2 Yes No DSL No ...
3 Yes Yes DSL Yes ...
4 Yes Yes DSL No ...
... ... ... ... ... ...
5875 Yes No DSL Yes ...
5876 Yes Yes DSL No ...
5877 No No phone service Fiber optic Yes ...
5878 No No phone service Fiber optic Yes ...
5879 Yes No No No internet service ...
DeviceProtection TechSupport StreamingTV \
0 No internet service No internet service No internet service
1 Yes No Yes
2 Yes Yes No
3 Yes No No
4 No Yes No
... ... ... ...
5875 No Yes No
5876 Yes Yes No
5877 Yes Yes Yes
5878 Yes Yes Yes
5879 No internet service No internet service No internet service
StreamingMovies Contract PaperlessBilling PaymentMethod \
0 No internet service Month-to-month Yes Bank transfer
1 No Month-to-month No Mailed check
2 No One year No Electronic check
3 No Month-to-month No Credit card
4 Yes Month-to-month No Electronic check
... ... ... ... ...
5875 No Month-to-month Yes Mailed check
5876 Yes One year No Mailed check
5877 No One year Yes Bank transfer
5878 Yes Month-to-month Yes Mailed check
5879 No internet service Month-to-month No Mailed check
MonthlyCharges TotalCharges Churn
0 49.85 1146.55 0
1 100.70 4330.10 1
2 97.33 4963.83 1
3 101.38 7299.36 0
4 52.22 1305.50 1
... ... ... ...
5875 74.21 5268.91 0
5876 65.43 1439.46 1
5877 59.78 4065.04 0
5878 91.88 1286.32 0
5879 25.45 585.35 1
[5880 rows x 21 columns]
In [57]:
df['Partner'].replace({'Yes': 1, 'No': 0}, inplace=True) # Replace 'Yes' with 1 and 'No' with 0 in 'Partner' column
print(df['Partner'])
0 0
1 1
2 0
3 0
4 0
..
5875 1
5876 0
5877 0
5878 1
5879 1
Name: Partner, Length: 5880, dtype: int64
In [154]:
df.describe()
Out[154]:
| SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|
| count | 5880.000000 | 5880.000000 | 5880.000000 | 5880.000000 |
| mean | 0.500680 | 36.549150 | 70.157779 | 2565.315153 |
| std | 0.500042 | 20.909674 | 28.804615 | 1905.670601 |
| min | 0.000000 | 1.000000 | 20.000000 | 20.030000 |
| 25% | 0.000000 | 18.000000 | 45.717500 | 1020.217500 |
| 50% | 1.000000 | 37.000000 | 70.155000 | 2136.445000 |
| 75% | 1.000000 | 55.000000 | 95.457500 | 3767.665000 |
| max | 1.000000 | 72.000000 | 119.990000 | 7888.836250 |
In [155]:
df.index
Out[155]:
RangeIndex(start=0, stop=5880, step=1)
In [153]:
import plotly.graph_objects as go
# Define index for x-axis
x = df.index
# Create line plots for each column
fig = go.Figure()
# Plot SeniorCitizen
fig.add_trace(go.Scatter(x=x, y=df['SeniorCitizen'], mode='lines+markers', name='SeniorCitizen'))
# Plot Tenure
fig.add_trace(go.Scatter(x=x, y=df['tenure'], mode='lines+markers', name='Tenure'))
# Plot MonthlyCharges
fig.add_trace(go.Scatter(x=x, y=df['MonthlyCharges'], mode='lines+markers', name='MonthlyCharges'))
# Plot TotalCharges
fig.add_trace(go.Scatter(x=x, y=df['TotalCharges'], mode='lines+markers', name='TotalCharges'))
# Customize plot
fig.update_layout(title='DataFrame Line Plots', xaxis_title='Index', yaxis_title='Values', legend_title='Columns')
# Show plot
fig.show()
In [159]:
import pandas as pd
import matplotlib.pyplot as plt
# Cross-tabulation of Gender vs Review
gender_review_ct = pd.crosstab(df['gender'], df['InternetService'])
print(gender_review_ct)
gender_review_ct.plot(kind='bar', stacked=True, title='Usage of Internet Services By Gender')
plt.xlabel('gender')
plt.ylabel('Count')
plt.show()
InternetService DSL Fiber optic No gender Female 938 957 1035 Male 998 958 994
In [151]:
df.rename(columns={'gender':'Sex'})
Out[151]:
| customerID | Sex | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CUST0000 | Male | 0 | No | Yes | 23 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Bank transfer | 49.85 | 1146.55 | No |
| 1 | CUST0001 | Female | 0 | Yes | No | 43 | No | No phone service | DSL | Yes | ... | Yes | No | Yes | No | Month-to-month | No | Mailed check | 100.70 | 4330.10 | Yes |
| 2 | CUST0002 | Male | 1 | No | No | 51 | Yes | No | DSL | No | ... | Yes | Yes | No | No | One year | No | Electronic check | 97.33 | 4963.83 | Yes |
| 3 | CUST0003 | Male | 1 | No | No | 72 | Yes | Yes | DSL | Yes | ... | Yes | No | No | No | Month-to-month | No | Credit card | 101.38 | 7299.36 | No |
| 4 | CUST0004 | Male | 1 | No | No | 25 | Yes | Yes | DSL | No | ... | No | Yes | No | Yes | Month-to-month | No | Electronic check | 52.22 | 1305.50 | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5875 | CUST5875 | Male | 0 | Yes | Yes | 71 | Yes | No | DSL | Yes | ... | No | Yes | No | No | Month-to-month | Yes | Mailed check | 74.21 | 5268.91 | No |
| 5876 | CUST5876 | Male | 0 | No | No | 22 | Yes | Yes | DSL | No | ... | Yes | Yes | No | Yes | One year | No | Mailed check | 65.43 | 1439.46 | Yes |
| 5877 | CUST5877 | Female | 0 | No | No | 68 | No | No phone service | Fiber optic | Yes | ... | Yes | Yes | Yes | No | One year | Yes | Bank transfer | 59.78 | 4065.04 | No |
| 5878 | CUST5878 | Female | 0 | Yes | Yes | 14 | No | No phone service | Fiber optic | Yes | ... | Yes | Yes | Yes | Yes | Month-to-month | Yes | Mailed check | 91.88 | 1286.32 | No |
| 5879 | CUST5879 | Female | 1 | Yes | No | 23 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Mailed check | 25.45 | 585.35 | Yes |
5880 rows × 21 columns
In [105]:
df_2= df.loc[:,['MonthlyCharges','TotalCharges']]
df_2.head()
Out[105]:
| MonthlyCharges | TotalCharges | |
|---|---|---|
| 0 | 49.85 | 1146.55 |
| 1 | 100.70 | 4330.10 |
| 2 | 97.33 | 4963.83 |
| 3 | 101.38 | 7299.36 |
| 4 | 52.22 | 1305.50 |
In [106]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_2)
df_scaled = scaler.transform(df_2)
df_scaled
Out[106]:
array([[-0.7050782 , -0.74364961],
[ 1.06041406, 0.92325676],
[ 0.94340897, 1.25507771],
...,
[-0.36031246, 0.78447138],
[ 0.75418708, -0.67046606],
[-1.55223673, -1.03749386]])
In [63]:
df_scaled = pd.DataFrame(df_scaled)
In [66]:
df_scaled.describe().round(0)
Out[66]:
| 0 | 1 | |
|---|---|---|
| count | 5880.0 | 5880.0 |
| mean | 0.0 | 0.0 |
| std | 1.0 | 1.0 |
| min | -2.0 | -1.0 |
| 25% | -1.0 | -1.0 |
| 50% | -0.0 | -0.0 |
| 75% | 1.0 | 1.0 |
| max | 2.0 | 3.0 |
In [109]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
minmax.fit(df_2)#also renaming data frame
df_minmax = minmax.transform(df_2)
df_minmax
Out[109]:
array([[0.29852985, 0.13145584],
[0.80708071, 0.50295056],
[0.77337734, 0.57690176],
...,
[0.39783978, 0.47202018],
[0.71887189, 0.14776587],
[0.05450545, 0.0659683 ]])
In [110]:
df_minmax=pd.DataFrame(df_minmax)
df_minmax.describe()
Out[110]:
| 0 | 1 | |
|---|---|---|
| count | 5880.000000 | 5880.000000 |
| mean | 0.501628 | 0.297189 |
| std | 0.288075 | 0.222884 |
| min | 0.000000 | 0.000000 |
| 25% | 0.257201 | 0.116714 |
| 50% | 0.501600 | 0.246969 |
| 75% | 0.754650 | 0.437319 |
| max | 1.000000 | 1.000000 |
In [112]:
df.head()
Out[112]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CUST0000 | Male | 0 | No | Yes | 23 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Bank transfer | 49.85 | 1146.55 | No |
| 1 | CUST0001 | Female | 0 | Yes | No | 43 | No | No phone service | DSL | Yes | ... | Yes | No | Yes | No | Month-to-month | No | Mailed check | 100.70 | 4330.10 | Yes |
| 2 | CUST0002 | Male | 1 | No | No | 51 | Yes | No | DSL | No | ... | Yes | Yes | No | No | One year | No | Electronic check | 97.33 | 4963.83 | Yes |
| 3 | CUST0003 | Male | 1 | No | No | 72 | Yes | Yes | DSL | Yes | ... | Yes | No | No | No | Month-to-month | No | Credit card | 101.38 | 7299.36 | No |
| 4 | CUST0004 | Male | 1 | No | No | 25 | Yes | Yes | DSL | No | ... | No | Yes | No | Yes | Month-to-month | No | Electronic check | 52.22 | 1305.50 | Yes |
5 rows × 21 columns
In [113]:
method_counts = df['PaymentMethod'].value_counts()
In [114]:
method_counts
Out[114]:
PaymentMethod Credit card 1503 Electronic check 1488 Mailed check 1462 Bank transfer 1427 Name: count, dtype: int64
In [117]:
df.dtypes
Out[117]:
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges float64 Churn object dtype: object
In [118]:
pd.get_dummies(df['PaymentMethod'],drop_first=True).astype(int).head(10)
Out[118]:
| Credit card | Electronic check | Mailed check | |
|---|---|---|---|
| 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 1 |
| 2 | 0 | 1 | 0 |
| 3 | 1 | 0 | 0 |
| 4 | 0 | 1 | 0 |
| 5 | 1 | 0 | 0 |
| 6 | 0 | 0 | 0 |
| 7 | 0 | 0 | 1 |
| 8 | 0 | 0 | 1 |
| 9 | 0 | 1 | 0 |
In [119]:
pd.get_dummies(df['PaymentMethod'],).astype(int).head(10)
Out[119]:
| Bank transfer | Credit card | Electronic check | Mailed check | |
|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 1 |
| 2 | 0 | 0 | 1 | 0 |
| 3 | 0 | 1 | 0 | 0 |
| 4 | 0 | 0 | 1 | 0 |
| 5 | 0 | 1 | 0 | 0 |
| 6 | 1 | 0 | 0 | 0 |
| 7 | 0 | 0 | 0 | 1 |
| 8 | 0 | 0 | 0 | 1 |
| 9 | 0 | 0 | 1 | 0 |
In [120]:
df
Out[120]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CUST0000 | Male | 0 | No | Yes | 23 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Bank transfer | 49.85 | 1146.55 | No |
| 1 | CUST0001 | Female | 0 | Yes | No | 43 | No | No phone service | DSL | Yes | ... | Yes | No | Yes | No | Month-to-month | No | Mailed check | 100.70 | 4330.10 | Yes |
| 2 | CUST0002 | Male | 1 | No | No | 51 | Yes | No | DSL | No | ... | Yes | Yes | No | No | One year | No | Electronic check | 97.33 | 4963.83 | Yes |
| 3 | CUST0003 | Male | 1 | No | No | 72 | Yes | Yes | DSL | Yes | ... | Yes | No | No | No | Month-to-month | No | Credit card | 101.38 | 7299.36 | No |
| 4 | CUST0004 | Male | 1 | No | No | 25 | Yes | Yes | DSL | No | ... | No | Yes | No | Yes | Month-to-month | No | Electronic check | 52.22 | 1305.50 | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5875 | CUST5875 | Male | 0 | Yes | Yes | 71 | Yes | No | DSL | Yes | ... | No | Yes | No | No | Month-to-month | Yes | Mailed check | 74.21 | 5268.91 | No |
| 5876 | CUST5876 | Male | 0 | No | No | 22 | Yes | Yes | DSL | No | ... | Yes | Yes | No | Yes | One year | No | Mailed check | 65.43 | 1439.46 | Yes |
| 5877 | CUST5877 | Female | 0 | No | No | 68 | No | No phone service | Fiber optic | Yes | ... | Yes | Yes | Yes | No | One year | Yes | Bank transfer | 59.78 | 4065.04 | No |
| 5878 | CUST5878 | Female | 0 | Yes | Yes | 14 | No | No phone service | Fiber optic | Yes | ... | Yes | Yes | Yes | Yes | Month-to-month | Yes | Mailed check | 91.88 | 1286.32 | No |
| 5879 | CUST5879 | Female | 1 | Yes | No | 23 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Mailed check | 25.45 | 585.35 | Yes |
5880 rows × 21 columns
In [124]:
df.describe().round(2)
Out[124]:
| SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|
| count | 5880.0 | 5880.00 | 5880.00 | 5880.00 |
| mean | 0.5 | 36.55 | 70.16 | 2566.81 |
| std | 0.5 | 20.91 | 28.80 | 1910.02 |
| min | 0.0 | 1.00 | 20.00 | 20.03 |
| 25% | 0.0 | 18.00 | 45.72 | 1020.22 |
| 50% | 1.0 | 37.00 | 70.16 | 2136.44 |
| 75% | 1.0 | 55.00 | 95.46 | 3767.66 |
| max | 1.0 | 72.00 | 119.99 | 8589.60 |
In [126]:
upper_limit = df['tenure'].mean() + 3*df['tenure'].std()
lower_limit = df['tenure'].mean() - 3*df['tenure'].std()
In [127]:
upper_limit
Out[127]:
99.27817099498411
In [128]:
lower_limit
Out[128]:
-26.179871675256223
In [129]:
Outlier_zscore = df[(df['tenure'] >=upper_limit)|(df['tenure'] <=lower_limit)]
Outlier_zscore
Out[129]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn |
|---|
0 rows × 21 columns
In [130]:
upper_limit = df['TotalCharges'].mean() + 3*df['TotalCharges'].std()
lower_limit = df['TotalCharges'].mean() - 3*df['TotalCharges'].std()
In [131]:
upper_limit
Out[131]:
8296.866392560543
In [132]:
lower_limit
Out[132]:
-3163.2400626285716
In [136]:
Outlier_zscore = df[(df['TotalCharges'] >=upper_limit)|(df['TotalCharges'] <=lower_limit)]
Outlier_zscore
Out[136]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 410 | CUST0410 | Male | 1 | No | Yes | 72 | No | No phone service | Fiber optic | Yes | ... | Yes | No | No | No | Month-to-month | Yes | Credit card | 118.03 | 8498.16 | Yes |
| 764 | CUST0764 | Male | 1 | Yes | No | 71 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Electronic check | 118.58 | 8419.18 | No |
| 1006 | CUST1006 | Female | 0 | Yes | No | 72 | No | No phone service | DSL | No | ... | No | Yes | Yes | Yes | Two year | Yes | Mailed check | 116.53 | 8390.16 | No |
| 1362 | CUST1362 | Male | 0 | No | No | 72 | Yes | Yes | DSL | No | ... | No | Yes | Yes | No | Two year | No | Credit card | 117.75 | 8478.00 | Yes |
| 1765 | CUST1765 | Female | 0 | Yes | No | 72 | Yes | Yes | Fiber optic | Yes | ... | Yes | Yes | No | Yes | One year | Yes | Bank transfer | 119.30 | 8589.60 | No |
| 5082 | CUST5082 | Male | 0 | No | No | 71 | Yes | No | Fiber optic | Yes | ... | No | No | No | Yes | One year | Yes | Bank transfer | 119.79 | 8505.09 | No |
| 5349 | CUST5349 | Male | 1 | No | Yes | 72 | Yes | No | Fiber optic | No | ... | Yes | No | Yes | Yes | Month-to-month | Yes | Credit card | 117.35 | 8449.20 | No |
| 5476 | CUST5476 | Male | 1 | Yes | No | 71 | Yes | Yes | DSL | No | ... | Yes | No | Yes | No | One year | Yes | Mailed check | 118.74 | 8430.54 | No |
8 rows × 21 columns
In [138]:
trimmed_df = df[(df['TotalCharges'] <=upper_limit) & (df['TotalCharges'] >=lower_limit)] #trimming using Z-score
trimmed_df.shape
Out[138]:
(5872, 21)
In [139]:
df=pd.read_csv(r'/Users/binodrai/Desktop/1718991964134-9073-customer_churn_data.csv')
In [141]:
# IQR METHOD ::Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df['TotalCharges'].quantile(0.25)
Q3 = df['TotalCharges'].quantile(0.75)
IQR = Q3 - Q1
# Define lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
In [142]:
lower_bound
Out[142]:
-3100.9537500000006
In [143]:
upper_bound
Out[143]:
7888.83625
In [145]:
###Outlier IQR score
Outlier_zscore
Outlier_iqr = df[(df['TotalCharges'] >=upper_bound)|(df['TotalCharges'] <=lower_bound)]
Outlier_iqr
Out[145]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 83 | CUST0083 | Female | 1 | Yes | No | 71 | Yes | Yes | Fiber optic | Yes | ... | No | Yes | No | No | One year | No | Bank transfer | 112.54 | 7990.34 | Yes |
| 410 | CUST0410 | Male | 1 | No | Yes | 72 | No | No phone service | Fiber optic | Yes | ... | Yes | No | No | No | Month-to-month | Yes | Credit card | 118.03 | 8498.16 | Yes |
| 650 | CUST0650 | Male | 0 | Yes | Yes | 72 | No | No phone service | Fiber optic | No | ... | No | Yes | No | Yes | Month-to-month | Yes | Mailed check | 111.87 | 8054.64 | No |
| 764 | CUST0764 | Male | 1 | Yes | No | 71 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Electronic check | 118.58 | 8419.18 | No |
| 1006 | CUST1006 | Female | 0 | Yes | No | 72 | No | No phone service | DSL | No | ... | No | Yes | Yes | Yes | Two year | Yes | Mailed check | 116.53 | 8390.16 | No |
| 1362 | CUST1362 | Male | 0 | No | No | 72 | Yes | Yes | DSL | No | ... | No | Yes | Yes | No | Two year | No | Credit card | 117.75 | 8478.00 | Yes |
| 1598 | CUST1598 | Female | 1 | No | Yes | 68 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Electronic check | 118.03 | 8026.04 | No |
| 1631 | CUST1631 | Male | 1 | No | Yes | 69 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Bank transfer | 115.61 | 7977.09 | No |
| 1765 | CUST1765 | Female | 0 | Yes | No | 72 | Yes | Yes | Fiber optic | Yes | ... | Yes | Yes | No | Yes | One year | Yes | Bank transfer | 119.30 | 8589.60 | No |
| 1976 | CUST1976 | Male | 0 | No | No | 67 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | Yes | Mailed check | 118.63 | 7948.21 | No |
| 2218 | CUST2218 | Female | 1 | No | Yes | 70 | No | No phone service | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Electronic check | 117.34 | 8213.80 | Yes |
| 2225 | CUST2225 | Male | 0 | Yes | No | 69 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | Yes | Credit card | 117.39 | 8099.91 | Yes |
| 2314 | CUST2314 | Male | 1 | No | Yes | 72 | Yes | No | Fiber optic | No | ... | No | No | Yes | No | One year | Yes | Bank transfer | 111.85 | 8053.20 | Yes |
| 2818 | CUST2818 | Male | 1 | Yes | No | 67 | Yes | Yes | Fiber optic | Yes | ... | Yes | Yes | Yes | Yes | Two year | Yes | Electronic check | 118.89 | 7965.63 | No |
| 3060 | CUST3060 | Female | 1 | No | No | 70 | No | No phone service | Fiber optic | Yes | ... | No | Yes | No | No | Two year | No | Bank transfer | 117.75 | 8242.50 | No |
| 3302 | CUST3302 | Male | 1 | Yes | Yes | 68 | Yes | No | DSL | Yes | ... | Yes | Yes | No | No | Two year | No | Credit card | 118.25 | 8041.00 | Yes |
| 3414 | CUST3414 | Male | 1 | No | No | 71 | No | No phone service | DSL | No | ... | Yes | No | Yes | Yes | Month-to-month | Yes | Electronic check | 116.80 | 8292.80 | No |
| 3473 | CUST3473 | Female | 0 | No | No | 68 | Yes | Yes | DSL | Yes | ... | No | No | No | Yes | Month-to-month | Yes | Bank transfer | 118.49 | 8057.32 | Yes |
| 3707 | CUST3707 | Female | 1 | No | Yes | 69 | No | No phone service | Fiber optic | Yes | ... | No | No | Yes | No | One year | No | Mailed check | 117.38 | 8099.22 | Yes |
| 3775 | CUST3775 | Female | 1 | Yes | Yes | 67 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | Yes | Bank transfer | 118.91 | 7966.97 | Yes |
| 3789 | CUST3789 | Male | 1 | Yes | No | 71 | No | No phone service | DSL | No | ... | No | Yes | No | Yes | Two year | Yes | Mailed check | 112.14 | 7961.94 | No |
| 4094 | CUST4094 | Female | 0 | Yes | Yes | 69 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 119.99 | 8279.31 | Yes |
| 4437 | CUST4437 | Female | 1 | Yes | No | 72 | Yes | No | Fiber optic | No | ... | Yes | Yes | No | Yes | Month-to-month | Yes | Bank transfer | 112.80 | 8121.60 | No |
| 4638 | CUST4638 | Female | 0 | Yes | No | 69 | Yes | Yes | DSL | Yes | ... | No | No | Yes | No | Two year | No | Credit card | 114.42 | 7894.98 | Yes |
| 4718 | CUST4718 | Female | 0 | No | Yes | 71 | Yes | Yes | Fiber optic | Yes | ... | Yes | Yes | No | No | Two year | No | Credit card | 115.53 | 8202.63 | No |
| 4774 | CUST4774 | Male | 0 | No | Yes | 68 | No | No phone service | Fiber optic | No | ... | No | No | Yes | Yes | One year | Yes | Credit card | 117.36 | 7980.48 | Yes |
| 4872 | CUST4872 | Male | 1 | Yes | Yes | 69 | No | No phone service | DSL | No | ... | No | No | No | Yes | Two year | No | Credit card | 116.72 | 8053.68 | No |
| 5082 | CUST5082 | Male | 0 | No | No | 71 | Yes | No | Fiber optic | Yes | ... | No | No | No | Yes | One year | Yes | Bank transfer | 119.79 | 8505.09 | No |
| 5257 | CUST5257 | Male | 0 | No | Yes | 68 | No | No phone service | DSL | No | ... | No | Yes | Yes | Yes | One year | Yes | Credit card | 116.22 | 7902.96 | No |
| 5349 | CUST5349 | Male | 1 | No | Yes | 72 | Yes | No | Fiber optic | No | ... | Yes | No | Yes | Yes | Month-to-month | Yes | Credit card | 117.35 | 8449.20 | No |
| 5476 | CUST5476 | Male | 1 | Yes | No | 71 | Yes | Yes | DSL | No | ... | Yes | No | Yes | No | One year | Yes | Mailed check | 118.74 | 8430.54 | No |
| 5704 | CUST5704 | Female | 1 | No | Yes | 71 | Yes | No | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Mailed check | 113.59 | 8064.89 | No |
32 rows × 21 columns
In [146]:
###tRIMMING IQR score
# Using IQR for trimming, assuming we already have lower_bound and upper_bound calculated
trimmed_df = df[(df['TotalCharges'] <=upper_bound) & (df['TotalCharges'] >=lower_bound)]
trimmed_df.shape
Out[146]:
(5848, 21)
In [147]:
##Capping using IQR
df['TotalCharges'] = np.where(
df['TotalCharges']>upper_bound,
upper_bound,
np.where(
df['TotalCharges']<lower_bound,
lower_bound,
df['TotalCharges']
)
)
In [148]:
df.describe().round(2)
Out[148]:
| SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|
| count | 5880.0 | 5880.00 | 5880.00 | 5880.00 |
| mean | 0.5 | 36.55 | 70.16 | 2565.32 |
| std | 0.5 | 20.91 | 28.80 | 1905.67 |
| min | 0.0 | 1.00 | 20.00 | 20.03 |
| 25% | 0.0 | 18.00 | 45.72 | 1020.22 |
| 50% | 1.0 | 37.00 | 70.16 | 2136.44 |
| 75% | 1.0 | 55.00 | 95.46 | 3767.66 |
| max | 1.0 | 72.00 | 119.99 | 7888.84 |
In [ ]: